# ++++++++++++++++++ Popular Packages Section ++++++++++++++++++++++++++++++++++
# NOTE(review): this script targets Python 2 (urllib2, old Selenium API).
import io
import json
import requests
import urllib2
import datetime
import csv
import time
import nltk
import re
import sklearn
import logging
import seaborn as sns
import numpy as np
import pandas as pd
# ++++++++++++++++++++ Web Scraping ++++++++++++++++++++++++++++++++++++++++++++
from lxml import html
# (duplicate "import urllib2" removed -- already imported above)
from bs4 import BeautifulSoup as bsoup
from urllib2 import urlopen
from time import sleep
from selenium import webdriver
# ++++++++++++++++++++ Word Preparation/Processing Section +++++++++++++++++++++
# Word analysis with NLTK: stopwords, FreqDist, word_tokenize
from nltk import tokenize
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# (duplicate "from nltk.corpus import stopwords" removed)
from pattern.en import sentiment
from collections import Counter
# Silence gensim UserWarnings so notebook output stays readable
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
# ++++++++++++++++++++ Topic Modeling / ML Section +++++++++++++++++++++++++++++
# Gensim packages
import gensim
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim import corpora, models, similarities
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
# Other Gensim and Sklearn packages for NMF, LSI, and LDA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
# Network analytics (clustering):
# distance / similarity between documents or texts
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
# clustering
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.cluster import AgglomerativeClustering
# WordCloud, matplotlib & other visualization packages
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# BUG FIX: "% matplotlib inline" is IPython magic, not Python syntax -- it is a
# SyntaxError in a plain .py script.  Keep it only inside a notebook cell.
# %matplotlib inline
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
def scrape_niche_reviews(url):
    """Scrape every review text from a Niche reviews page.

    Opens *url* in a fresh Chrome session, collects the text of each
    element with itemprop='reviewBody', and follows the "Next" pagination
    arrow until no further page exists.  Returns a list of review strings.
    """
    browser = webdriver.Chrome()
    reviews = []
    browser.get(url)
    while True:
        for review in browser.find_elements_by_xpath("//div[@itemprop='reviewBody']"):
            reviews.append(review.text)
        try:
            # Click the "Next" arrow; when it is absent we are on the last
            # page and the lookup raises, which ends the loop.
            next_link = browser.find_element_by_xpath(
                "//span[@class='icon-arrowright-thin--pagination']")
            next_link.click()
            time.sleep(3)  # give the next page time to render
        except Exception:  # was a bare except:, which also swallows KeyboardInterrupt
            break
    browser.quit()  # BUG FIX: the original leaked the Chrome session
    return reviews


def save_reviews(reviews, txt_path, csv_path):
    """Persist *reviews* both as a raw txt dump and as a one-column csv."""
    with open(txt_path, "w") as output:
        output.write(str(reviews))
    pd.DataFrame(reviews).to_csv(csv_path, encoding='utf-8',
                                 header=False, index=False)


# K-State reviews (KSU happens to be the best-ranked KS college dorm on Niche)
ksu_reviews = scrape_niche_reviews(
    "https://www.niche.com/colleges/kansas-state-university/reviews/")
ksu_count = len(ksu_reviews)
print("There are %d reviews for K-State in this dataset." % ksu_count)
save_reviews(ksu_reviews, "data/ksu-reviews.txt", "data/ksu_reviews.csv")
ksudf = pd.DataFrame(ksu_reviews)
ksudf.head()  # checking to see the data is in the frame

# University of Kansas reviews (worst-ranked KS college dorms on Niche)
ku_reviews = scrape_niche_reviews(
    "https://www.niche.com/colleges/university-of-kansas/reviews/")
ku_count = len(ku_reviews)
print("There are %d reviews for Univ. of Kansas in this dataset." % ku_count)
save_reviews(ku_reviews, "data/ku-reviews.txt", "data/ku_reviews.csv")
kudf = pd.DataFrame(ku_reviews)
kudf.head()  # checking to see the data is in the frame
# Scrape the Niche "best college dorms in Kansas (public)" ranking page
niche = "https://www.niche.com/colleges/search/best-college-dorms/s/kansas/?type=public"
# Query the website and parse the returned HTML with BeautifulSoup
page = urllib2.urlopen(niche)
soup = bsoup(page, "lxml")
n_pretty = soup.prettify()
# Collect the href of every anchor on the page (links to each KS college)
schools = [link.get("href") for link in soup.find_all("a")]
# School names and their Niche ranking badges
u_title = soup.find_all('h2', class_='search-result__title')
u_score = soup.find_all('span', class_='search-result-badge-ordinal')
# BUG FIX: zip the *text* of the tags, not the Tag objects themselves --
# zipping Tags writes their full HTML repr into the csv.
final_score = zip([t.get_text(strip=True) for t in u_title],
                  [s.get_text(strip=True) for s in u_score])
# Shape the results like an Excel sheet, then save
fsdf = pd.DataFrame(final_score)
fsdf.to_csv("data/fsdf.csv", encoding='utf-8', header=False, index=False)
# Ensure we have the data
fsdf
K-State Housing and Dining Services is always looking to improve the on-campus living areas (residential halls, apartments) by examining several types of data, including social media. In this instance, we will use Niche to compare KSU to KU by ranking.
Knowing how our current and future residential students actually feel — good or bad — about our residential areas on campus can help our department determine when and where to apply updates to facilities.
A potential student's choice of a Kansas college may be determined by the college's on-campus living areas. We have found that on-campus living increases the college's total enrollment, improves students' success in continuing to the next classification, and raises overall GPA and graduation rates.
Increased enrollment means increased revenue from money spent on housing, dining, student organizations, books, clothes, etc. by the student during their academic pursuit.
# ---------------- Word cleaning and processing ----------------
# The reviews are lists of strings; flatten each to a single lowercase string.
tokens = str(ksu_reviews).lower()
tokens2 = str(ku_reviews).lower()
# Replace everything that is not alphanumeric with a space
tokens = re.sub("[^a-zA-Z0-9]", " ", tokens)
tokens2 = re.sub("[^a-zA-Z0-9]", " ", tokens2)
# Tokenization (word split)
tokens = word_tokenize(tokens)
tokens2 = word_tokenize(tokens2)
# Lemmatize.  BUG FIX: the original used generator expressions throughout this
# pipeline; FreqDist() below exhausted them, so the DataFrames (and the csv
# files written from them) ended up EMPTY.  Lists can be iterated repeatedly.
wordnet_lemmatizer = WordNetLemmatizer()
tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]
tokens2 = [wordnet_lemmatizer.lemmatize(word) for word in tokens2]
# Remove stopwords: standard English plus domain-specific noise words
more_stopwords = set(('span', 'http', 'com', 'edu', 'year', 'abuse', 'campus', 'university', 'school'))
extra_stoplist = set(stopwords.words('english')) | more_stopwords
tokens = [word for word in tokens if word not in extra_stoplist]
tokens2 = [word for word in tokens2 if word not in extra_stoplist]
# Keep only alphabetic tokens of length >= 3 (drops numbers and short words)
tokens = [word for word in tokens if word.isalpha() and len(word) >= 3]
tokens2 = [word for word in tokens2 if word.isalpha() and len(word) >= 3]
# Frequency distribution of the cleaned tokens
fdist_ksu = nltk.FreqDist(tokens)
fdist_ku = nltk.FreqDist(tokens2)
# Convert the token lists to DataFrames and save them for further analysis
# in Tableau, Excel, etc.
ksu_df = pd.DataFrame(tokens)
ku_df = pd.DataFrame(tokens2)
ksu_df.to_csv('data/ksu_df.csv', encoding='utf-8', header=False, index=False)
ku_df.to_csv('data/ku_df.csv', encoding='utf-8', header=False, index=False)
# ---------------- K-State word-frequency analysis ----------------
# Show the 20 most common words
fdist_ksu.most_common(20)
# Plot of word frequency
plt.figure(figsize=(12, 8))
fdist_ksu.plot(20)
# Cumulative counts plot of word frequency
fdist_ksu.plot(20, cumulative=True)
# Which word in the distribution has the highest count?
print("The word with the greatest number of usage is: %s" % fdist_ksu.max())
# (word, count) pairs as a DataFrame -- replaces the original manual
# append loop over fdist_ksu.items()
wl_ksu = pd.DataFrame(fdist_ksu.items())
# View the first 10 words and their frequencies
wl_ksu.head(10)
# Top five words by frequency
wl_sort_ksu = wl_ksu.sort_values(1, ascending=False).head(5)
wl_sort_ksu
# Give the columns meaningful names
wl_ksu = wl_ksu.rename(columns={0: 'word', 1: 'freq'})
wl_ksu.tail(10)
# Save as csv for further analysis in Tableau, Excel, etc.
wl_ksu.to_csv('data/wl_ksu.csv', encoding='utf-8', header=False, index=False)
# Descriptive statistics of the word column
wl_ksu['word'].describe()
# ---------------- K-State review word cloud ----------------
d = path.dirname("data/")
# Read the whole word list.
# BUG FIX: the word list was saved as wl_ksu.csv above; the original tried to
# read a wl_ksu.txt that is never written anywhere in this script.
text = open(path.join(d, 'wl_ksu.csv')).read()
# Read the mask image; taken from http://www.freestencilgallery.com/?s=auto
mask = np.array(Image.open(path.join(d, "speech-bubble-stencil.png")))
# Stopwords for the cloud.  NOTE: do not name this variable "stopwords" --
# that shadows the nltk.corpus.stopwords import used earlier in the file.
more = set(('span', 'http', 'com', 'edu', 'year', 'abuse', 'campus', 'university', 'school'))
cloud_stopwords = set(STOPWORDS) | more
# Create the wordcloud using the png file as the background shape
wc = WordCloud(background_color="white", max_words=5000, mask=mask,
               stopwords=cloud_stopwords)
# Generate and show the word cloud
wc.generate(text)
plt.figure(figsize=(60, 60))
plt.imshow(wc)
plt.axis("off")
print("Here's what people are saying about KSU!")
# Save images for later use
plt.savefig("data/ksu_niche_review.png")
plt.savefig("data/ksu_niche_review.pdf")
# ---------------- Univ. of Kansas word-frequency analysis ----------------
# Show the 20 most common words
fdist_ku.most_common(20)
# Plot of word frequency
plt.figure(figsize=(12, 8))
fdist_ku.plot(20)
# Cumulative counts plot of word frequency
fdist_ku.plot(20, cumulative=True)
# Which word in the distribution has the highest count?
print("The word with the greatest number of usage is: %s" % fdist_ku.max())
# (word, count) pairs as a DataFrame -- replaces the manual append loop
wl_ku = pd.DataFrame(fdist_ku.items())
# View the first 10 words and their frequencies
wl_ku.head(10)
# Top five words by frequency.
# BUG FIX: the original stored this in "wl_sort_ksu", clobbering the
# K-State result with KU data (copy-paste naming error).
wl_sort_ku = wl_ku.sort_values(1, ascending=False).head(5)
wl_sort_ku
# Give the columns meaningful names
wl_ku = wl_ku.rename(columns={0: 'word', 1: 'freq'})
wl_ku.tail(10)
# Save as csv for further analysis in Tableau, Excel, etc.
wl_ku.to_csv('data/wl_ku.csv', encoding='utf-8', header=False, index=False)
# Descriptive statistics of the word column
wl_ku['word'].describe()
# ---------------- Univ. of Kansas review word cloud ----------------
d = path.dirname("data/")
# Read the whole word list.
# BUG FIX: the word list was saved as wl_ku.csv above; the original tried to
# read a wl_ku.txt that is never written anywhere in this script.
text = open(path.join(d, 'wl_ku.csv')).read()
# Read the mask image; taken from http://www.freestencilgallery.com/?s=auto
mask = np.array(Image.open(path.join(d, "speech-bubble-stencil.png")))
# Alternative mask, kept for reference:
# mask = np.array(Image.open(path.join(d, "jayhawk_head.png")))
# Stopwords for the cloud ('kansa'/'campu' are the lemmatized forms).
# NOTE: not named "stopwords" to avoid shadowing the nltk import.
more = set(('span', 'http', 'com', 'edu', 'year', 'abuse', 'kansa', 'campu', 'university', 'school'))
cloud_stopwords = set(STOPWORDS) | more
# Create the wordcloud using the png file as the background shape
wc = WordCloud(background_color="white", max_words=5000, mask=mask,
               stopwords=cloud_stopwords)
# Generate and show the word cloud
wc.generate(text)
plt.figure(figsize=(60, 60))
plt.imshow(wc)
plt.axis("off")
print("Here's what people are saying about KU!")
# Save images for later use
plt.savefig("data/ku_niche_review.png")
plt.savefig("data/ku_niche_review.pdf")
# ---------------- K-State sentiment analysis (Pattern) ----------------
# Read the saved reviews back in; `with` guarantees the handle is closed.
with io.open("data/ksu_reviews.csv", "rb") as openfile:
    # Skip blank rows so row[0] below cannot raise IndexError
    ksu2_reviews = [row for row in csv.reader(openfile) if row]
# Score each review and save the scores.
# BUG FIX: the original csv.writer file handle was never closed.
ksuscore = [sentiment(row[0]) for row in ksu2_reviews]
with io.open('data/ksu_sentiment_score.csv', 'wb') as scorefile:
    csv.writer(scorefile).writerows(zip(ksuscore))
# Bucket reviews by polarity; sentiment() returns (polarity, subjectivity)
positive_review = []
negative_review = []
neutral_review = []
for row in ksu2_reviews:
    ksureviews = row[0]
    PTscore = sentiment(ksureviews)
    if PTscore[0] > 0:
        positive_review.append(ksureviews)
    elif PTscore[0] == 0:
        neutral_review.append(ksureviews)
    else:
        negative_review.append(ksureviews)
# Display results of the sentiment analysis
print("From the dataset, there are %d positive reviews using Pattern sentiment analysis." % len(positive_review))
print("From the dataset, there are %d negative reviews using Pattern sentiment analysis." % len(negative_review))
print("From the dataset, there are %d neutral reviews using Pattern sentiment analysis." % len(neutral_review))
# ---------------- Univ. of Kansas sentiment analysis (Pattern) ----------------
# Read the saved reviews back in (io.open for consistency with the KSU twin)
with io.open("data/ku_reviews.csv", "rb") as openfile:
    # Skip blank rows so row[0] below cannot raise IndexError
    ku2_reviews = [row for row in csv.reader(openfile) if row]
# Score each review and save the scores.
# BUG FIX: output file renamed from 'ks_sentiment_score.csv' -- an apparent
# typo; the KSU twin writes 'ksu_sentiment_score.csv', so KU should be 'ku_'.
# Also: the csv.writer file handle was never closed; use `with`.
kuscore = [sentiment(row[0]) for row in ku2_reviews]
with io.open('data/ku_sentiment_score.csv', 'wb') as scorefile:
    csv.writer(scorefile).writerows(zip(kuscore))
# Bucket reviews by polarity; sentiment() returns (polarity, subjectivity)
positive_review2 = []
negative_review2 = []
neutral_review2 = []
for row in ku2_reviews:
    kureviews = row[0]
    PTscore = sentiment(kureviews)
    if PTscore[0] > 0:
        positive_review2.append(kureviews)
    elif PTscore[0] == 0:
        neutral_review2.append(kureviews)
    else:
        negative_review2.append(kureviews)
# Display results of the sentiment analysis
print("From the dataset, there are %d positive reviews using Pattern sentiment analysis." % len(positive_review2))
print("From the dataset, there are %d negative reviews using Pattern sentiment analysis." % len(negative_review2))
print("From the dataset, there are %d neutral reviews using Pattern sentiment analysis." % len(neutral_review2))
# ---------------- Choose k for KMeans (elbow method) ----------------
# Load the review documents from the spreadsheet
with open("data/ksu_reviews.csv", "rb") as openfile:
    ksu_content = [row for row in csv.reader(openfile)]
# Number of documents (reviews) with content
len(ksu_content)
# Convert the csv rows to a DataFrame and inspect it
ksucon = pd.DataFrame(ksu_content)
ksucon.head()
ksucon.shape  # (rows, columns)
# First column as a plain list of strings for vectorization
content1 = ksucon[0].values.tolist()
# Fixed seed so results are reproducible between runs
np.random.seed(1)
# PERF FIX: the tf-idf matrix does not depend on k, so build it ONCE
# outside the loop instead of refitting the vectorizer for every k.
X = TfidfVectorizer(stop_words='english', decode_error='ignore').fit_transform(content1)
ksu_inertia_scores = []
K = range(2, 7)
for k in K:
    model = KMeans(n_clusters=k).fit(X)
    ksu_inertia_scores.append(model.inertia_)
ksu_inertia_scores
# Elbow chart: pick the k where inertia stops dropping sharply
plt.figure(figsize=(14, 8))
plt.plot(K, ksu_inertia_scores, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Selecting k with the Elbow Method')
print("Now that we know the right k-value to use, let's continue with our analysis on this score.")
# ---------------- KMeans clustering with the chosen k ----------------
# k is the number of clusters, usually based on domain knowledge (and the
# elbow chart above).
np.random.seed(1)  # fixed seed for reproducible results
k = 3
tfidf_vectorizer = TfidfVectorizer(stop_words='english', decode_error='ignore')
X = tfidf_vectorizer.fit_transform(content1)
model = KMeans(n_clusters=k, n_init=1)
model.fit(X)
# Dense document-term matrix
dtm = X.toarray()
dtm
# View the matrix as a DataFrame (Excel-like)
pd.DataFrame(dtm, columns=tfidf_vectorizer.get_feature_names())
# labels_ : cluster label of each document
cluster_name = model.labels_
print(cluster_name)
# Documents and their cluster labels joined side by side
df = pd.DataFrame(content1, columns=['documents'])
df
df1 = pd.DataFrame(model.labels_, columns=['cluster'])
df1
df2 = df.join(df1)
df2
# Cluster centroids: which words are significant for each cluster (T = transpose)
model.cluster_centers_
pd.DataFrame(model.cluster_centers_, columns=tfidf_vectorizer.get_feature_names()).T
popularwordlist = pd.DataFrame(model.cluster_centers_,
                               columns=tfidf_vectorizer.get_feature_names()).T
# Sort by cluster-0 weight ascending, cluster-1 descending
popularwordlist.sort_values([0, 1], ascending=[1, 0])
model.cluster_centers_
# Per-cluster word indices in descending weight order
model.cluster_centers_.argsort()[:, ::-1]
# Save the per-cluster word-frequency list for further exploration.
# BUG FIX: the original path used a backslash ("data\word...") which is not a
# directory separator on POSIX systems -- every other path in this file uses
# "data/".  Reuse popularwordlist instead of rebuilding the same frame.
popularwordlist.to_csv("data/wordlist_per_cluster.csv", encoding='utf8',
                       header=False, index=False)
# Print the top five terms for each cluster
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()
for i in range(k):
    top5 = " ".join(terms[ind] for ind in order_centroids[i, :5])
    print("Cluster:  %d %s" % (i, top5))
# ---------------- Hierarchical (Ward) clustering dendrogram ----------------
tfidf_vectorizer = TfidfVectorizer(stop_words='english', decode_error='ignore')
X = tfidf_vectorizer.fit_transform(content1)
dtm = X.toarray()
ward_model = AgglomerativeClustering(n_clusters=k, linkage='ward').fit(dtm)
ward_model.labels_
# BUG FIX: the original called plt.figure() AFTER dendrogram() had already
# drawn, so savefig wrote a blank canvas.  Create and size the figure first.
plt.figure(figsize=(32, 16))
linkage_matrix = ward(dtm)
dendrogram(linkage_matrix, orientation="left", labels=ward_model.labels_)
plt.tight_layout()  # fixes margins
plt.savefig('data/ksu_ward_clusters.png', dpi=200)  # save figure as ward_clusters
https://www.analyticsvidhya.com/blog/2015/10/beginner-guide-web-scraping-beautiful-soup-python/
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
http://www.gregreda.com/2013/03/03/web-scraping-101-with-python/
https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe
http://selenium-python.readthedocs.io/locating-elements.html